{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "## A simple Hidden Markov Model Postagger\n",
    "\n",
    "You need to implement two parts of the HMM postagger.\n",
    "- A HMM model\n",
    "- viterbi decoding\n",
    "\n",
    "Keep in the following things in mind:\n",
    "- probability smoothing when estimating model parameters\n",
    "- (optional) tune hyperparameter on development set\n",
    "\n",
    "You should get an accuracy of more than **67.0** with proper discounting strategy."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# First, you need to implement you parameter estimation part.\n",
    "from collections import Counter\n",
    "from math import log\n",
    "\n",
    "class HMM(object):\n",
    "    def __init__(self, epsilon=1e-5, training_data=None):\n",
    "        self.epsilon = epsilon\n",
    "        if training_data is not None:\n",
    "            self.fit(training_data)\n",
    "\n",
    "\n",
    "    def fit(self, training_data):\n",
    "        '''\n",
    "        Counting the number of unigram, bigram, cooc and wordcount from the training\n",
    "        data.\n",
    "        \n",
    "        Parameters\n",
    "        ----------\n",
    "        training_data: list\n",
    "            A list of training data, each element is a tuple with words and postags.\n",
    "        '''\n",
    "        self.unigram = Counter()    # The count of postag unigram, e.g. unigram['NN']=5\n",
    "        self.bigram = Counter()     # The count of postag bigram, e.g. bigram[('PRP', 'VV')]=1\n",
    "        self.cooc = Counter()       # The count of word, postag, e.g. cooc[('I', 'PRP')]=1\n",
    "        self.wordcount = Counter()  # The count of word, e.g. word['I']=1\n",
    "    \n",
    "        print('building HMM model ...')\n",
    "        for words, tags in training_data:\n",
    "            # Your code here! You need to implement the ngram counting part. Please count\n",
    "            # - unigram\n",
    "            # - bigram\n",
    "            # - cooc\n",
    "            # - wordcount\n",
    "\n",
    "        print('HMM model is built.')\n",
    "        self.postags = [k for k in self.unigram]\n",
    "\n",
    "            \n",
    "    def emit(self, words, i, tag):\n",
    "        '''\n",
    "        Given a word and a postag, give the log emission probability of P(word|tag)\n",
    "        Please refer the `foundation of statistial natural language processing`, Chapter 10\n",
    "        \n",
    "        Parameters\n",
    "        ----------\n",
    "        words: list(str)\n",
    "            The list of words\n",
    "        i: int\n",
    "            The ith word\n",
    "        tag: str    \n",
    "            The postag\n",
    "            \n",
    "        Returns\n",
    "        -------\n",
    "        prob: float\n",
    "            The log probability\n",
    "        '''\n",
    "        # Your code here! You need to implement the log emission probability part.\n",
    "        prob = \n",
    "        return prob\n",
    "    \n",
    "    \n",
    "    def trans(self, tag, tag1):\n",
    "        '''\n",
    "        Given two postags, give the log transition probability of P(tag1|tag)\n",
    "        Please refer the `foundation of statistial natural language processing`, Chapter 10\n",
    "        \n",
    "        Parameters\n",
    "        ----------\n",
    "        tag: str\n",
    "            The previous postag\n",
    "        tag1: str    \n",
    "            The current postag\n",
    "            \n",
    "        Returns\n",
    "        -------\n",
    "        prob: float\n",
    "            The log probability\n",
    "        '''\n",
    "        # Your code here! You need to implement the log transition probability part.\n",
    "        prob = \n",
    "        return prob"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# The tiny example.\n",
    "training_dataset = [(['dog', 'chase', 'cat'], ['NN', 'VV', 'NN']),\n",
    "                    (['I', 'chase', 'dog'], ['PRP', 'VV', 'NN']),\n",
    "                    (['cat', 'chase', 'mouse'], ['NN', 'VV', 'NN'])\n",
    "                   ]\n",
    "\n",
    "hmm = HMM(training_data=training_dataset)\n",
    "\n",
    "# Testing if the parameter are correctly estimated.\n",
    "assert hmm.unigram['NN'] == 5\n",
    "assert hmm.bigram['VV', 'NN'] == 3\n",
    "assert hmm.bigram['NN', 'VV'] == 2\n",
    "assert hmm.cooc['dog', 'NN'] == 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# We implement the viterbi decoding algorithm.\n",
    "def viterbi(words, hmm):\n",
    "    '''\n",
    "    Viterbi algorihtm.\n",
    "    \n",
    "    Parameters\n",
    "    ----------\n",
    "    words: list(str)\n",
    "        The list of words\n",
    "    hmm: HMM\n",
    "        The hmm model\n",
    "        \n",
    "    Return\n",
    "    ------\n",
    "    result: list(str)\n",
    "        The POS-tag for each word.\n",
    "    '''\n",
    "    # unpack the length of words, and number of postags\n",
    "    N, T = len(words), len(hmm.postags)\n",
    "    \n",
    "    # allocate the decode matrix\n",
    "    score = [[-float('inf') for j in range(T)] for i in range(N)]\n",
    "    path = [[-1 for j in range(T)] for i in range(N)]\n",
    "    \n",
    "    for i, word in enumerate(words):\n",
    "        if i == 0:\n",
    "            for j, tag in enumerate(hmm.postags):\n",
    "                score[i][j] = hmm.emit(words, i, tag)\n",
    "        else:\n",
    "            for j, tag in enumerate(hmm.postags):\n",
    "                best, best_t = -1e20, -1\n",
    "                # Your code here, enumerate all the previous tag\n",
    "                \n",
    "                score[i][j] = best\n",
    "                path[i][j] = best_t\n",
    "\n",
    "    #\n",
    "    best, best_t = -1e20, -1\n",
    "    for j, tag in enumerate(postags):\n",
    "        if best < score[len(words)- 1][j]:\n",
    "            best = score[len(words)- 1][j]\n",
    "            best_t = j\n",
    "\n",
    "    result = [best_t]\n",
    "    for i in range(len(words)-1, 0, -1):\n",
    "        # Your code here, back trace to recover the full viterbi decode path\n",
    "    \n",
    "    # convert POStag indexing to POStag str\n",
    "    result = [hmm.postags[t] for t in reversed(result)]\n",
    "    return result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Test with tiny example.\n",
    "testing_dataset = [['dog', 'chase', 'mouse'],\n",
    "                  ['I', 'chase', 'dog']]\n",
    "\n",
    "for testing_data in testing_dataset:\n",
    "    tags = viterbi(testing_data, hmm)\n",
    "    print tags"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from dataset import read_dataset\n",
    "\n",
    "train_dataset = read_dataset('./penn.train.pos.gz')\n",
    "devel_dataset = read_dataset('./penn.devel.pos.gz')\n",
    "\n",
    "print('%d is training sentences.' % len(train_dataset))\n",
    "print('%d is development sentences.' % len(devel_dataset))\n",
    "\n",
    "hmm.fit(train_dataset)\n",
    "\n",
    "n_corr, n_total = 0, 0\n",
    "for devel_data_x, devel_data_y in devel_dataset:\n",
    "    pred_y = viterbi(devel_data_x, hmm)\n",
    "\n",
    "    for pred_tag, corr_tag in zip(pred_y, devel_data_y):\n",
    "        if pred_tag == corr_tag:\n",
    "            n_corr += 1\n",
    "        n_total += 1\n",
    "\n",
    "print(\"accuracy=%f\" % (float(n_corr)/ n_total))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Lets play with the HMM postagger\n",
    "print viterbi(['HMM', 'is', 'a', 'widely', 'used', 'model', '.'], hmm)\n",
    "print viterbi(['I', 'like', 'cat', ',', 'but', 'I', 'hate', 'eating', 'fish', '.'], hmm)\n",
    "\n",
    "# and more you example"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Work around the test dataset\n",
    "from __future__ import print_function\n",
    "\n",
    "test_dataset = read_dataset('./penn.test.pos.blind.gz')\n",
    "\n",
    "fpo=open('./penn.test.pos.out', 'w')\n",
    "\n",
    "for test_data_x, test_data_y in test_dataset:\n",
    "    pred_y = viterbi(test_data_x, hmm)\n",
    "    print(\" \".join(y for y in pred_y), file=fpo)\n",
    "\n",
    "fpo.close()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
